RFM stands for Recency - Frequency - Monetary Value. Theoretically we will have segments like below:
Low Value: customers who are less active than others, not very frequent buyers/visitors, and who generate very low (zero, or maybe even negative) revenue. Mid Value: in the middle of everything — they use our platform often (though not as much as our High Values), are fairly frequent, and generate moderate revenue. High Value: the group we don’t want to lose — high revenue, high frequency, and low inactivity.
# Notebook-wide CSS styling for headings, body text and plot output cells.
# Review fixes to the stylesheet: 'ariel' is not a font family ('arial' is),
# 'horizontal-align' is not a CSS property (removed — text-align already
# centers horizontally), and 'charcoal' is not a CSS named color (replaced
# with its usual hex value #36454F).
from IPython.core.display import HTML
HTML("""
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
h1,h2 {
text-align: center;
background-color: black;
padding: 20px;
margin: 0;
color: yellow;
font-family: arial;
border-radius: 80px;
}
h3 {
text-align: center;
border-style: solid;
border-width: 3px;
padding: 12px;
margin: 0;
color: black;
font-family: arial;
border-radius: 80px;
border-color: gold;
}
body, p {
font-family: arial;
font-size: 15px;
color: #36454F;
}
div {
font-size: 14px;
margin: 0;
}
h4 {
padding: 0px;
margin: 0;
font-family: arial;
color: purple;
}
</style>
""")
# Library imports and notebook setup.
from datetime import datetime, timedelta
import pandas as pd
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# NOTE(review): a __future__ import must be the first statement of a real
# module; this only works here because notebook cells execute independently.
from __future__ import division
from chart_studio import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
#initiate visualization library for jupyter notebook
pyoff.init_notebook_mode()
# Load the retail transactions and parse invoice timestamps into datetimes.
df = pd.read_csv('Online_Retail.csv')
df = df.assign(InvoiceDate=pd.to_datetime(df['InvoiceDate']))
df['InvoiceDate'].describe()
count 495478 unique 21220 top 2011-10-31 14:41:00 freq 1114 first 2010-12-01 08:26:00 last 2011-12-09 12:49:00 Name: InvoiceDate, dtype: object
# Restrict to UK transactions and compute per-customer Recency:
# days elapsed since the customer's last purchase, relative to the
# most recent purchase date in the whole dataset.
df_uk = df[df['Country'] == 'United Kingdom'].reset_index(drop=True)
df_user = pd.DataFrame({'CustomerID': df_uk['CustomerID'].unique()})
df_max_purchase = (
    df_uk.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
    .rename(columns={'InvoiceDate': 'MaxPurchaseDate'})
)
snapshot_date = df_max_purchase['MaxPurchaseDate'].max()
df_max_purchase['Recency'] = (snapshot_date - df_max_purchase['MaxPurchaseDate']).dt.days
df_user = df_user.merge(df_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
df_user.head()
| CustomerID | Recency | |
|---|---|---|
| 0 | 17850.0 | 301 |
| 1 | 13047.0 | 31 |
| 2 | 13748.0 | 95 |
| 3 | 15100.0 | 329 |
| 4 | 15291.0 | 25 |
# Summary statistics of Recency (days since each customer's last purchase).
df_user.Recency.describe()
count 3950.000000 mean 90.778481 std 100.230349 min 0.000000 25% 16.000000 50% 49.000000 75% 142.000000 max 373.000000 Name: Recency, dtype: float64
# Distribution of Recency across customers.
recency_trace = go.Histogram(x=df_user['Recency'])
fig = go.Figure(data=[recency_trace], layout=go.Layout(title='Recency'))
pyoff.iplot(fig)
We will apply the Elbow Method to find the cluster count that gives the best trade-off between inertia and the number of clusters.
# Elbow method on Recency: fit k-means for k = 1..9 and record the inertia
# (within-cluster SSE) for each k.
# Bug fix: the original fitted on df_recency directly, so the "clusters"
# column written on iteration k was fed back in as a feature on iteration
# k+1; fit on the Recency column only. Also take an explicit .copy() so the
# column write does not hit a view of df_user (SettingWithCopyWarning).
from sklearn.cluster import KMeans

sse = {}
df_recency = df_user[['Recency']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_recency[['Recency']])
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()
Here it looks like 3 is the optimal one. Based on business requirements, we can go ahead with less or more clusters. We will be selecting 4.
# Cluster customers into 4 groups on Recency and inspect each cluster.
recency_model = KMeans(n_clusters=4)
df_user['RecencyCluster'] = recency_model.fit_predict(df_user[['Recency']])
df_user.groupby('RecencyCluster')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyCluster | ||||||||
| 0 | 954.0 | 77.679245 | 22.850898 | 48.0 | 59.00 | 72.5 | 93.00 | 131.0 |
| 1 | 478.0 | 304.393305 | 41.183489 | 245.0 | 266.25 | 300.0 | 336.00 | 373.0 |
| 2 | 1950.0 | 17.488205 | 13.237058 | 0.0 | 6.00 | 16.0 | 28.00 | 47.0 |
| 3 | 568.0 | 184.625000 | 31.753602 | 132.0 | 156.75 | 184.0 | 211.25 | 244.0 |
We have added one function to our code, order_cluster(). K-means assigns cluster labels as arbitrary numbers, not in an ordered way: with four clusters labeled 0–3, we can’t say cluster 0 is the worst and cluster 3 is the best until we relabel them.
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel k-means cluster ids so they are ordered by cluster mean.

    Computes the mean of ``target_field_name`` per cluster, ranks the
    clusters by that mean (direction controlled by ``ascending``), and
    replaces the original arbitrary cluster ids in ``df`` with their rank
    (0, 1, 2, ...). Returns a new DataFrame; ``df`` is not modified.
    """
    # Per-cluster means, sorted into the desired order; the row position
    # after sorting becomes the new cluster id.
    means = (
        df.groupby(cluster_field_name)[target_field_name]
        .mean()
        .reset_index()
        .sort_values(by=target_field_name, ascending=ascending)
        .reset_index(drop=True)
    )
    means['index'] = means.index
    # Map each row's old cluster id to its rank, then swap the columns so
    # the rank takes over the original cluster column's name.
    relabeled = (
        df.merge(means[[cluster_field_name, 'index']], on=cluster_field_name)
        .drop(columns=[cluster_field_name])
        .rename(columns={'index': cluster_field_name})
    )
    return relabeled
# Relabel RecencyCluster with descending Recency means (ascending=False), so
# higher cluster numbers mean more recent (lower Recency) customers.
df_user = order_cluster('RecencyCluster', 'Recency', df_user, False)
| CustomerID | Recency | RecencyCluster | |
|---|---|---|---|
| 0 | 17850.0 | 301 | 0 |
| 1 | 15100.0 | 329 | 0 |
| 2 | 18074.0 | 373 | 0 |
| 3 | 16250.0 | 260 | 0 |
| 4 | 13747.0 | 373 | 0 |
| ... | ... | ... | ... |
| 3945 | 15942.0 | 133 | 1 |
| 3946 | 14143.0 | 133 | 1 |
| 3947 | 16147.0 | 133 | 1 |
| 3948 | 15149.0 | 133 | 1 |
| 3949 | 15776.0 | 132 | 1 |
3950 rows × 3 columns
# Per-cluster Recency stats after relabeling: cluster 3 now holds the most
# recent customers, cluster 0 the most inactive.
df_user.groupby('RecencyCluster')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyCluster | ||||||||
| 0 | 478.0 | 304.393305 | 41.183489 | 245.0 | 266.25 | 300.0 | 336.00 | 373.0 |
| 1 | 568.0 | 184.625000 | 31.753602 | 132.0 | 156.75 | 184.0 | 211.25 | 244.0 |
| 2 | 954.0 | 77.679245 | 22.850898 | 48.0 | 59.00 | 72.5 | 93.00 | 131.0 |
| 3 | 1950.0 | 17.488205 | 13.237058 | 0.0 | 6.00 | 16.0 | 28.00 | 47.0 |
3 covers most recent customers whereas 0 has the most inactive ones.
# Frequency = number of transaction rows per customer.
# NOTE(review): this counts line items, not distinct invoices; if "number of
# orders" is intended, InvoiceNo.nunique() would be the measure — confirm.
df_frequency = (
    df_uk.groupby('CustomerID')['InvoiceDate']
    .count()
    .reset_index()
    .rename(columns={'InvoiceDate': 'Frequency'})
)
df_frequency.head()
| CustomerID | Frequency | |
|---|---|---|
| 0 | 12346.0 | 2 |
| 1 | 12747.0 | 103 |
| 2 | 12748.0 | 4642 |
| 3 | 12749.0 | 231 |
| 4 | 12820.0 | 59 |
# Attach the Frequency column to the per-customer table.
df_user = df_user.merge(df_frequency, on='CustomerID')
df_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | |
|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 |
| 1 | 15100.0 | 329 | 0 | 6 |
| 2 | 18074.0 | 373 | 0 | 13 |
| 3 | 16250.0 | 260 | 0 | 24 |
| 4 | 13747.0 | 373 | 0 | 1 |
# Summary statistics for Frequency (heavily right-skewed: median 41, max 7983).
df_user.Frequency.describe()
count 3950.000000 mean 91.614684 std 220.557389 min 1.000000 25% 17.000000 50% 41.000000 75% 101.000000 max 7983.000000 Name: Frequency, dtype: float64
# Frequency distribution, trimmed to < 1000 purchases to keep the scale readable.
freq_trace = go.Histogram(x=df_user[df_user['Frequency'] < 1000]['Frequency'])
fig = go.Figure(data=[freq_trace], layout=go.Layout(title='Frequency'))
pyoff.iplot(fig)
# Elbow method on Frequency (same procedure as for Recency).
# Bug fix: the original fitted on df_frequency directly, so the "clusters"
# column written on iteration k leaked into the features on iteration k+1;
# fit on the Frequency column only. The .copy() keeps the column write off
# a view of df_user (SettingWithCopyWarning).
sse = {}
df_frequency = df_user[['Frequency']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_frequency[['Frequency']])
    df_frequency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()
# Cluster customers into 4 groups on Frequency and inspect each cluster.
frequency_model = KMeans(n_clusters=4)
df_user['FrequencyCluster'] = frequency_model.fit_predict(df_user[['Frequency']])
df_user.groupby('FrequencyCluster')['Frequency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| FrequencyCluster | ||||||||
| 0 | 429.0 | 331.221445 | 133.856510 | 191.0 | 228.0 | 287.0 | 399.0 | 803.0 |
| 1 | 3.0 | 5917.666667 | 1805.062418 | 4642.0 | 4885.0 | 5128.0 | 6555.5 | 7983.0 |
| 2 | 3496.0 | 49.525744 | 44.954212 | 1.0 | 15.0 | 33.0 | 73.0 | 190.0 |
| 3 | 22.0 | 1313.136364 | 505.934524 | 872.0 | 988.5 | 1140.0 | 1452.0 | 2782.0 |
# Order frequency clusters ascending (higher cluster number = more frequent),
# then compute per-customer monetary value: Revenue = price x quantity,
# summed over all of a customer's line items.
df_user = order_cluster('FrequencyCluster', 'Frequency', df_user, True)
df_uk['Revenue'] = df_uk['Quantity'].mul(df_uk['UnitPrice'])
df_revenue = df_uk.groupby('CustomerID')['Revenue'].sum().reset_index()
df_revenue.head()
| CustomerID | Revenue | |
|---|---|---|
| 0 | 12346.0 | 0.00 |
| 1 | 12747.0 | 4196.01 |
| 2 | 12748.0 | 29072.10 |
| 3 | 12749.0 | 3868.20 |
| 4 | 12820.0 | 942.34 |
# Attach Revenue to the per-customer table and summarize it.
df_user = df_user.merge(df_revenue, on='CustomerID')
df_user.Revenue.describe()
count 3950.000000 mean 1713.385669 std 6548.608224 min -4287.630000 25% 282.255000 50% 627.060000 75% 1521.782500 max 256438.490000 Name: Revenue, dtype: float64
# Revenue distribution, trimmed to < 10000 to keep the scale readable.
revenue_trace = go.Histogram(x=df_user[df_user['Revenue'] < 10000]['Revenue'])
fig = go.Figure(data=[revenue_trace], layout=go.Layout(title='Monetary Value'))
pyoff.iplot(fig)
import warnings
# NOTE(review): blanket warning suppression hides real problems; kept to
# preserve the original behavior, but a targeted filter would be safer.
warnings.filterwarnings("ignore")
# Elbow method on Revenue (same procedure as for Recency and Frequency).
# Bug fix: the original fitted on df_revenue directly, so the "clusters"
# column written on iteration k leaked into the features on iteration k+1;
# fit on the Revenue column only. The .copy() keeps the column write off a
# view of df_user (SettingWithCopyWarning).
sse = {}
df_revenue = df_user[['Revenue']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_revenue[['Revenue']])
    df_revenue["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE (inertia)")
plt.show()
# Cluster customers into 4 groups on Revenue, order the labels ascending
# (higher cluster number = more revenue), and inspect each cluster.
revenue_model = KMeans(n_clusters=4)
df_user['RevenueCluster'] = revenue_model.fit_predict(df_user[['Revenue']])
df_user = order_cluster('RevenueCluster', 'Revenue', df_user, True)
df_user.groupby('RevenueCluster')['Revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RevenueCluster | ||||||||
| 0 | 3687.0 | 907.254414 | 921.910820 | -4287.63 | 263.115 | 572.56 | 1258.220 | 4314.72 |
| 1 | 234.0 | 7760.699530 | 3637.173671 | 4330.67 | 5161.485 | 6549.38 | 9142.305 | 21535.90 |
| 2 | 27.0 | 43070.445185 | 15939.249588 | 25748.35 | 28865.490 | 36351.42 | 53489.790 | 88125.38 |
| 3 | 2.0 | 221960.330000 | 48759.481478 | 187482.17 | 204721.250 | 221960.33 | 239199.410 | 256438.49 |
# Inspect the per-customer table with all three ordered cluster labels attached.
df_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | |
|---|---|---|---|---|---|---|---|
| 0 | 17850.0 | 301 | 0 | 312 | 1 | 5288.63 | 1 |
| 1 | 14688.0 | 7 | 3 | 359 | 1 | 5107.38 | 1 |
| 2 | 13767.0 | 1 | 3 | 399 | 1 | 16945.71 | 1 |
| 3 | 15513.0 | 30 | 3 | 314 | 1 | 14520.08 | 1 |
| 4 | 14849.0 | 21 | 3 | 392 | 1 | 7904.28 | 1 |
# OverallScore = sum of the three ordered cluster labels (0..8); higher means
# a more recent, more frequent, higher-revenue customer.
df_user['OverallScore'] = df_user['RecencyCluster'] + df_user['FrequencyCluster'] + df_user['RevenueCluster']
# Fix: multi-column selection on a GroupBy requires a list; the bare tuple
# form df.groupby(...)['a','b'] is deprecated and removed in pandas 2.0.
df_user.groupby('OverallScore')[['Recency', 'Frequency', 'Revenue']].mean()
| Recency | Frequency | Revenue | |
|---|---|---|---|
| OverallScore | |||
| 0 | 304.584388 | 21.995781 | 303.339705 |
| 1 | 185.362989 | 32.596085 | 498.087546 |
| 2 | 78.991304 | 46.963043 | 868.082991 |
| 3 | 20.689610 | 68.419590 | 1091.416414 |
| 4 | 14.892617 | 271.755034 | 3607.097114 |
| 5 | 9.662162 | 373.290541 | 9136.946014 |
| 6 | 7.740741 | 876.037037 | 22777.914815 |
| 7 | 1.857143 | 1272.714286 | 103954.025714 |
| 8 | 1.333333 | 5917.666667 | 42177.930000 |
The scoring above clearly shows that customers with score 8 are our best customers, whereas those with score 0 are the worst.
To keep things simple, better we name these scores:
0 to 2: Low Value 3 to 4: Mid Value 5+: High Value
# Customer counts per score; high scores (5+) are a small minority.
df_user.groupby('OverallScore')['Recency'].count()
OverallScore 0 474 1 562 2 920 3 1511 4 298 5 148 6 27 7 7 8 3 Name: Recency, dtype: int64
# Map OverallScore to named segments: 0-2 Low, 3-4 Mid, 5+ High.
df_user['Segment'] = 'Low-Value'
mid_mask = df_user['OverallScore'] > 2
high_mask = df_user['OverallScore'] > 4
df_user.loc[mid_mask, 'Segment'] = 'Mid-Value'
df_user.loc[high_mask, 'Segment'] = 'High-Value'
# Frequency vs Revenue scatter, colored by segment (outliers trimmed so the
# bulk of the data stays visible).
df_graph = df_user.query("Revenue < 50000 and Frequency < 2000")
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]
plot_data = []
for segment, label, size, color, opacity in segment_styles:
    subset = df_graph[df_graph['Segment'] == segment]
    plot_data.append(
        go.Scatter(
            x=subset['Frequency'],
            y=subset['Revenue'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Frequency"},
    title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Recency vs Revenue scatter, colored by segment (same trimming as above).
df_graph = df_user.query("Revenue < 50000 and Frequency < 2000")
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]
plot_data = []
for segment, label, size, color, opacity in segment_styles:
    subset = df_graph[df_graph['Segment'] == segment]
    plot_data.append(
        go.Scatter(
            x=subset['Recency'],
            y=subset['Revenue'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Recency"},
    title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Recency vs Frequency scatter, colored by segment (same trimming as above).
df_graph = df_user.query("Revenue < 50000 and Frequency < 2000")
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]
plot_data = []
for segment, label, size, color, opacity in segment_styles:
    subset = df_graph[df_graph['Segment'] == segment]
    plot_data.append(
        go.Scatter(
            x=subset['Recency'],
            y=subset['Frequency'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
        )
    )
plot_layout = go.Layout(
    yaxis={'title': "Frequency"},
    xaxis={'title': "Recency"},
    title='Segments'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
You can see how the segments are clearly differentiated from each other in terms of RFM.